knitr::opts_chunk$set(
fig.align = "center",
fig.pos = "h",
message = FALSE,
warning = FALSE,
out.width = "90%",
dpi = 300,
cache = TRUE
)
library(dplyr)
library(ggplot2)
library(ggtext)
library(stringr)
library(tidyr)
library(ngram)
library(extrafont)
library(grid)
library(gridExtra)
loadfonts()
newtheme <- theme_minimal(base_family = "Candara")+
theme(plot.title = element_markdown(size = 11, color = "grey30", margin = margin(b = -8)),
plot.subtitle = element_text(size = 9.5, color = "grey40", hjust = 1),
legend.position = "none",
panel.grid = element_blank(),
panel.spacing = unit(0, "mm"),
panel.background = element_blank(),
strip.background = element_blank(),
strip.text = element_text(size = 8.5, margin = margin(t = 2),
color = "grey60", family = "Haettenschweiler"),
axis.text = element_blank(),
axis.title = element_blank())
Legendarium_titles <- c("The Hobbit",
"The Fellowship of the Ring",
"The Two Towers",
"The Return of the King",
"The Silmarillion*",
"The Children of Húrin*",
"Of Tuor and His Coming to Gondolin*")
load("data/Legendarium.RData")
readLines_wrap <- function(c) {
readLines(paste0("data/", c), encoding = "UTF-8")%>%
as.data.frame() %>% `colnames<-`("Text")
}
Hobbit_raw <- readLines_wrap("The Hobbit.txt")
LoTR_1_raw <- readLines_wrap("01 - The Fellowship Of The Ring.txt")
LoTR_2_raw <- readLines_wrap("02 - The Two Towers.txt")
LoTR_3_raw <- readLines_wrap("03 - The Return Of The King.txt")
Sil_raw <- readLines_wrap("The Silmarillion.txt")
CoH_raw <- readLines_wrap("The Children of Hurin.txt")
Tuor_raw <- readLines_wrap("Of Tuor And His Coming To Gondolin.txt")
## Functions for Data Cleaning
Tolkien_cleaning <- function(text_raw) {
text_raw %>%
mutate(Text = trimws(Text))%>%
filter(Text != "")%>%
mutate(Text = str_remove_all(Text, "\\f"))
}
Tolkien_summarise <- function(text) {
text %>%
group_by(Chapter) %>%
mutate(Title = nth(Text, 2))%>%
slice(-1)%>%
filter(Text != Title, Text != as.character(Chapter))%>%
summarise(Title = unique(Title),
Text = paste(Text, collapse = " "),
Word_Count = sum(wordcount(Text)))
}
## Hobbit --------------------
Hobbit <- Hobbit_raw %>%
Tolkien_cleaning()%>%
slice(which(str_detect(Text, "Chapter 1")):n())%>%
mutate(Chapter = findInterval(1:n(), which(str_detect(Text, "Chapter [0-9]+"))) %>%
as.factor())%>%
Tolkien_summarise()
## LoTR ----------------------
LoTR_cleaning <- function(text_raw, Volume) {
text_raw %>%
Tolkien_cleaning() %>%
slice( min(which(str_detect(Text, "_Chapter 1_"))):
(min(which(str_detect(Text, "Here ends the [a-z]+ part|======"))) - 1))%>%
mutate(Book = ifelse(row_number() < max(which(Text == "_Chapter 1_")),
2 * Volume - 1, 2 * Volume),
Book = as.roman(Book)) %>%
group_by(Book) %>%
mutate(Chapter = paste0(Book, "-", findInterval(1:n(), which(str_detect(Text, "_Chapter [0-9]+_")))))%>%
mutate(Chapter = factor(Chapter, levels = str_sort(unique(Chapter), numeric = TRUE))) %>%
mutate(Text = str_remove_all(Text, "_"))%>%
Tolkien_summarise()
}
LoTR_1 <- LoTR_cleaning(LoTR_1_raw, Volume = 1)
LoTR_2 <- LoTR_cleaning(LoTR_2_raw, Volume = 2)
LoTR_3 <- LoTR_cleaning(LoTR_3_raw, Volume = 3)
## The Silmarillion -----------------
Sil <- Sil_raw %>%
Tolkien_cleaning()%>%
slice( which(Text == "AINULINDALË"):
max(which(Text == "NOTE ON PRONUNCIATION") - 1))%>%
mutate(Chapter = findInterval(1:n(), which(str_detect(Text, "Chapter [0-9]+"))))%>%
mutate(Chapter = case_when(row_number() < which(Text == "VALAQUENTA") ~ "AINU",
row_number() < which(Text == "QUENTA SILMARILLION") ~ "VALA",
row_number() >= which(Text == "OF THE RINGS OF POWER AND THE THIRD AGE") ~ "RINGS",
row_number() >= which(Text == "AKALLABÊTH") ~ "AKALLABÊTH",
TRUE ~ as.character(Chapter)))%>%
filter(Chapter != "0") %>%
mutate(Chapter = factor(Chapter, levels = c("AINU", "VALA", as.character(1:24),
"AKALLABÊTH", "RINGS")))%>%
Tolkien_summarise()
## The Children of Húrin ------------
CoH <- CoH_raw %>%
Tolkien_cleaning()%>%
filter(!str_detect(Text, "—"))%>%
slice( which(str_detect(Text, "CHAPTER I")):
(which(str_detect(Text, "APPENDIX")) - 1))%>%
mutate(Chapter = findInterval(1:n(), which(str_detect(Text, "CHAPTER "))))%>%
mutate(Chapter = as.factor(Chapter))%>%
Tolkien_summarise()
## Of Tuor And His Coming To Gondolin ------------
Tuor <- Tuor_raw %>%
Tolkien_cleaning() %>%
mutate(Chapter = "") %>%
group_by(Chapter) %>%
summarise(Text = paste(Text, collapse = " "))%>%
mutate(Word_Count = wordcount(Text))%>%
mutate(Chapter = as.factor(Chapter))
Legendarium_list <- list(Hobbit, LoTR_1, LoTR_2, LoTR_3, Sil, CoH, Tuor)%>%
`names<-`(Legendarium_titles)
lex_width <- sapply(Legendarium_list, function(df) {sum(df$Word_Count)})%>%
unname()
lex_matrix <- sapply(1:length(lex_width), function(x) {
out <- rep(x, max(lex_width))
if(x != which.max(lex_width)) {
out[(lex_width[x]+1):max(lex_width)] <- NA}
return(out)
}) %>% t()
#save(Legendarium_list, lex_width, lex_matrix, file = "data/Legendarium.RData")
Keyword_locate <- function(text, keyword_regex, exclude = NULL) {
split <- paste0("(?<=", tolower(keyword_regex), ")")
if(!is.null(exclude)) {
split <- paste0(split, "(?<!", tolower(exclude), ")")
}
split <- paste0(split, "\\S*")
loc <- tolower(text) %>%
str_split(split) %>%
unlist %>% trimws%>%
sapply(wordcount) %>%
unname()
loc <- loc[-length(loc)] %>% cumsum
if(length(loc) == 0) {NA}
else {loc}
}
Keyword_df <- function(text_df, keyword_regex, exclude = NULL) {
kword_loc <- sapply(text_df$Text, Keyword_locate,
keyword_regex = keyword_regex,
exclude = exclude)
if(typeof(kword_loc) == "list") { kword_times <- sapply(kword_loc, length) }
else { kword_times <- sum(length(kword_loc)) }
data.frame(Chapter = rep(text_df$Chapter, kword_times),
Keyword_Loc = unlist(kword_loc) %>% unname,
Keyword_Count = length(unlist(kword_loc)[!is.na(unlist(kword_loc))]))%>%
mutate(Chapter = factor(Chapter, levels(text_df$Chapter))) %>%
merge(text_df %>% select(-Text))%>%
mutate()
}
## Lexical Dispersion Plot
lex_dispersion <- function(title, Keyword_Loc_list) {
w_count <- formatC(sum(Legendarium_list[[title]]$Word_Count),
format = "f", big.mark = ",", digits = 0)
k_count <- Keyword_Loc_list[[title]][1, "Keyword_Count"]
plot_title <- paste0("***", title, "*** - <span style='color:tomato4'>",
k_count, ifelse(k_count <= 1, " Time", " Times"), "</span>")
plot_subtitle <- paste(w_count, "Words")
if(title == tail(Legendarium_titles, 1)) {
plot_title <- paste(plot_title, "<span style='font-family:Candara; font-size:9.5pt'>",
plot_subtitle, "</span>")
plot_subtitle <- " "
}
p <- ggplot(Keyword_Loc_list[[title]]) +
geom_rect(data = Legendarium_list[[title]] %>%
mutate(fill = (row_number() %% 2 == 1)),
aes(xmin = 0, xmax = Word_Count,
ymin = 0, ymax = 1, fill = fill))+
labs(title = plot_title, subtitle = plot_subtitle)+
scale_fill_manual(values = c("#d5d6d9", "#ededed"))+
coord_cartesian(expand = FALSE)+
facet_grid(~ Chapter, scales = "free_x", space = "free_x",
switch = "x")+
newtheme
if(k_count > 0) {
p <- p + geom_segment(aes(x = Keyword_Loc, xend = Keyword_Loc,
y = 0, yend = 1),
size = 0.5, color = "tomato3",
alpha = 0.6)
}
p
}
Legendarium_lex_dispersion <- function(keyword, keyword_regex, exclude = NULL) {
Keyword_Loc_list <- lapply(Legendarium_list, Keyword_df,
keyword_regex = keyword_regex,
exclude = exclude) %>%
`names<-`(Legendarium_titles)
lex_list <- lapply(Legendarium_titles, lex_dispersion,
Keyword_Loc_list = Keyword_Loc_list)
grid.arrange(
grobs = lex_list,
top = textGrob(c(toupper(paste0("The Appearance of the Word \"",
keyword, "\" in Tolkien's Legendarium\n")),
"* Books Edited by Christopher Tolkien\n"),
x = c(0.01, 0.99), hjust = c(0, 1),
gp = gpar(fontfamily = "Candara",
col = c("grey25", "grey50"), lineheight = 0.3,
fontsize = c(13, 10.5), fontface = c("bold", "plain"))),
bottom = textGrob("@akela",
hjust = 1, vjust = 0, x = 0.99, y = 1,
gp = gpar(fontfamily = "Candara", col = "grey65", fontsize = 10)),
layout_matrix = lex_matrix
)
}
Legendarium_lex_dispersion(
keyword = "hope\", \"hopeful\" or \"estel",
keyword_regex = "hope|estel",
exclude = "no hope|hopeless")
Legendarium_lex_dispersion(
keyword = "hopeless\", \"no hope\" or \"desperate",
keyword_regex = "hopeless|desperate|no hope|despair")
#### Doom/Fate
Legendarium_lex_dispersion(
keyword = "doom\" or \"fate",
keyword_regex = "\\bdoom\\b|\\bdooms\\b|fate")
Legendarium_lex_dispersion(
keyword = "tiding",
keyword_regex = "tiding")
Legendarium_lex_dispersion(
keyword = "hobbit\" or \"halfling",
keyword_regex = "hobbit|halfling")
Legendarium_lex_dispersion(
keyword = "elf",
keyword_regex = "\\belf\\b|\\belves\\b|\\belven\\b")
Legendarium_lex_dispersion(
keyword = "dwarf",
keyword_regex = "dwarf|dwarves|dwarven")
Legendarium_lex_dispersion(
keyword = "Orc\" or \"Goblin",
keyword_regex = "\\borc\\b|\\borcs\\b|goblin")
Legendarium_lex_dispersion(
keyword = "eagle",
keyword_regex = "eagle")
Legendarium_lex_dispersion(
keyword = "three\" or \"third" ,
keyword_regex = "\\bthree\\b|\\bthird\\b")
Legendarium_lex_dispersion(
keyword = "seven\" or \"seventh",
keyword_regex = "\\bseven\\b|seventh")
Legendarium_lex_dispersion(
keyword = "nine\" or \"ninth",
keyword_regex = "\\bnine\\b|ninth")